[perf] Do not make compiler life harder #164
Conversation
|
Will retest on Linux. |
|
I'm surprised to see such a big difference. I've run similar benchmarks on an Intel Mac, and while I didn't see a 10x improvement, the difference is still huge (about 5x faster). |
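For anyone who wants to repeat the comparison, a minimal micro-benchmark sketch is below (hypothetical, not the benchmark used in this thread; the byte-wise kernel, buffer size, and iteration count are placeholders, and either Unmask variant can be swapped in):

#include <chrono>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

// Placeholder kernel; replace with the Before/After variants to compare them.
static void XorMask(uint8_t *data, size_t length, const uint8_t mask[4]) {
  for (size_t i = 0; i < length; i++) data[i] ^= mask[i % 4];
}

int main() {
  const size_t kSize = 1 << 20;   // 1 MiB payload (arbitrary)
  const int kIterations = 2000;   // arbitrary
  std::vector<uint8_t> buffer(kSize, 0xAB);
  const uint8_t mask[4] = {0x12, 0x34, 0x56, 0x78};

  auto start = std::chrono::steady_clock::now();
  for (int i = 0; i < kIterations; i++) XorMask(buffer.data(), kSize, mask);
  auto end = std::chrono::steady_clock::now();

  double seconds = std::chrono::duration<double>(end - start).count();
  double gib = double(kSize) * kIterations / (1024.0 * 1024.0 * 1024.0);
  std::printf("%.2f GiB/s\n", gib / seconds);
  return 0;
}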
|
Can you please run clang-format with style Google? |
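(For reference, that is typically clang-format -i --style=Google run over the touched C++ sources; the exact file paths depend on the repo layout.)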
|
On a Linux VM the difference is not as big, but still 2x faster in some cases. |
|
This might be affected by both clang++/g++ and arm64/x86-64. The most common case on servers is likely g++ and x86-64 on Linux. Will do that (if no one beats me to it).
|
side note: on Mac, an optimized (not the current) JS impl for |
This is the VM env mentioned above, but it is virtualized.
The JS implementation is used for very small buffers where the cost of calling the native bindings isn't worth the effort, so it does not really matter. |
|
While reading the changes I also noticed that it is time to use |
|
@lpinca I was considering replacing the native addon with JS, as the new JS was 1.5x faster; this is why it mattered 🙃
That is unfortunately broken in Node.js for now. But yes, it makes sense to support that here. As for perf, I'll get to my x86-64 / Linux machine soon to test this locally. |
|
Here are two versions which could be explored in Godbolt.

Before:

#include <cstddef>
#include <cstdint>
struct Args0 {
uint8_t *source;
uint8_t *mask;
uint8_t *destination;
uint32_t offset;
uint32_t length;
};
struct Args1 {
uint8_t *source;
size_t length;
uint8_t *mask;
};
void* Mask(Args0 args0) {
uint8_t *source = args0.source;
uint8_t *mask = args0.mask;
uint8_t *destination = args0.destination;
uint32_t offset = args0.offset;
uint32_t length = args0.length;
destination += offset;
uint32_t index = 0;
//
// Alignment preamble.
//
while (index < length && ((size_t)source % 8)) {
*destination++ = *source++ ^ mask[index % 4];
index++;
}
length -= index;
if (!length)
return NULL;
//
// Realign mask and convert to 64 bit.
//
uint8_t maskAlignedArray[8];
for (uint8_t i = 0; i < 8; i++, index++) {
maskAlignedArray[i] = mask[index % 4];
}
//
// Apply 64 bit mask in 8 byte chunks.
//
uint32_t loop = length / 8;
uint64_t *pMask8 = (uint64_t *)maskAlignedArray;
while (loop--) {
uint64_t *pFrom8 = (uint64_t *)source;
uint64_t *pTo8 = (uint64_t *)destination;
*pTo8 = *pFrom8 ^ *pMask8;
source += 8;
destination += 8;
}
//
// Apply mask to remaining data.
//
uint8_t *pmaskAlignedArray = maskAlignedArray;
length %= 8;
while (length--) {
*destination++ = *source++ ^ *pmaskAlignedArray++;
}
return NULL;
}
void* Unmask(Args1 args1) {
uint8_t *source = args1.source;
size_t length = args1.length;
uint8_t *mask = args1.mask;
uint32_t index = 0;
//
// Alignment preamble.
//
while (index < length && ((size_t)source % 8)) {
*source++ ^= mask[index % 4];
index++;
}
length -= index;
if (!length)
return NULL;
//
// Realign mask and convert to 64 bit.
//
uint8_t maskAlignedArray[8];
for (uint8_t i = 0; i < 8; i++, index++) {
maskAlignedArray[i] = mask[index % 4];
}
//
// Apply 64 bit mask in 8 byte chunks.
//
uint32_t loop = length / 8;
uint64_t *pMask8 = (uint64_t *)maskAlignedArray;
while (loop--) {
uint64_t *pSource8 = (uint64_t *)source;
*pSource8 ^= *pMask8;
source += 8;
}
//
// Apply mask to remaining data.
//
uint8_t *pmaskAlignedArray = maskAlignedArray;
length %= 8;
while (length--) {
*source++ ^= *pmaskAlignedArray++;
}
return NULL;
}

After:

#include <cstddef>
#include <cstdint>
struct Args0 {
uint8_t *source;
uint8_t *mask;
uint8_t *destination;
uint32_t offset;
uint32_t length;
};
struct Args1 {
uint8_t *source;
size_t length;
uint8_t *mask;
};
void* Mask(Args0 args0) {
uint8_t *source = args0.source;
uint8_t *mask = args0.mask;
uint8_t *destination = args0.destination;
uint32_t offset = args0.offset;
uint32_t length = args0.length;
destination += offset;
uint32_t index = 0;
//
// Alignment preamble.
//
while (index < length && ((size_t)source % 8)) {
*destination++ = *source++ ^ mask[index % 4];
index++;
}
length -= index;
if (!length)
return NULL;
//
// Realign mask and convert to 64 bit.
//
uint8_t maskAlignedArray[8];
for (uint8_t i = 0; i < 8; i++, index++) {
maskAlignedArray[i] = mask[index % 4];
}
//
// Apply 64 bit mask in 8 byte chunks.
//
uint32_t loop = length / 8;
uint64_t mask8 = ((uint64_t *)maskAlignedArray)[0];
uint64_t *pFrom8 = (uint64_t *)source;
uint64_t *pTo8 = (uint64_t *)destination;
for (uint32_t i = 0; i < loop; i++) pTo8[i] = pFrom8[i] ^ mask8;
source += 8 * loop;
destination += 8 * loop;
//
// Apply mask to remaining data.
//
length %= 8;
for (uint32_t i = 0; i < length; i++) {
destination[i] = source[i] ^ maskAlignedArray[i];
}
return NULL;
}
void* Unmask(Args1 args1) {
uint8_t *source = args1.source;
uint8_t *mask = args1.mask;
size_t length = args1.length;
uint32_t index = 0;
//
// Alignment preamble.
//
while (index < length && ((size_t)source % 8)) {
*source++ ^= mask[index % 4];
index++;
}
length -= index;
if (!length)
return NULL;
//
// Realign mask and convert to 64 bit.
//
uint8_t maskAlignedArray[8];
for (uint8_t i = 0; i < 8; i++, index++) {
maskAlignedArray[i] = mask[index % 4];
}
//
// Apply 64 bit mask in 8 byte chunks.
//
uint32_t loop = length / 8;
uint64_t mask8 = ((uint64_t *)maskAlignedArray)[0];
uint64_t *pSource8 = (uint64_t *)source;
for (uint32_t i = 0; i < loop; i++) pSource8[i] ^= mask8;
source += 8 * loop;
//
// Apply mask to remaining data.
//
length %= 8;
for (uint32_t i = 0; i < length; i++) {
source[i] ^= maskAlignedArray[i];
}
return NULL;
} |
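For reference, listings like the ones below can also be generated locally, e.g. g++ -O3 -S -masm=intel file.cc or clang++ -O3 -S file.cc; the exact flags behind the g++ snippets in this thread are not stated, so treat those as assumptions (only the clang++ run below is explicitly -O3).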
|
g++ on x86_64, Before:

mov QWORD PTR [rsp-16], rcx
test r8d, r8d
je .L71
cmp r8d, 1
je .L77
mov edi, r8d
movq xmm1, rcx
mov rdx, rax
shr edi
punpcklqdq xmm1, xmm1
sal rdi, 4
add rdi, rax
.L73:
movdqu xmm0, XMMWORD PTR [rdx]
add rdx, 16
pxor xmm0, xmm1
movups XMMWORD PTR [rdx-16], xmm0
cmp rdx, rdi
jne .L73
test r8b, 1
je .L74
movabs rdx, 34359738352
and rdx, rsi
add rdx, rax
.L72:
xor QWORD PTR [rdx], rcx
.L74:

g++ on x86_64, After:

mov r8, QWORD PTR [rsp-24]
cmp esi, 1
je .L68
mov ecx, esi
movq xmm1, r8
mov rdx, rax
shr ecx
punpcklqdq xmm1, xmm1
sal rcx, 4
add rcx, rax
.L64:
movdqu xmm0, XMMWORD PTR [rdx]
add rdx, 16
pxor xmm0, xmm1
movups XMMWORD PTR [rdx-16], xmm0
cmp rdx, rcx
jne .L64
test sil, 1
je .L62
mov edx, esi
and edx, -2
.L63:
xor QWORD PTR [rax+rdx*8], r8
.L62:

The main loop is identical. |
|
clang++ on armv8-a with -O3, Before:

b.hs .LBB1_26
tbz w9, #0, .LBB1_26
mov x9, x8
b .LBB1_29
.LBB1_26:
ldr d0, [sp, #8]
and x13, x11, #0xfffffffc
add x9, x8, x13, lsl #3
sub w11, w11, w13
add x8, x8, #16
dup v0.2d, v0.d[0]
mov x14, x13
.LBB1_27:
ldp q1, q2, [x8, #-16]
subs x14, x14, #4
eor v1.16b, v1.16b, v0.16b
eor v2.16b, v2.16b, v0.16b
stp q1, q2, [x8, #-16]
add x8, x8, #32
b.ne .LBB1_27
cmp x12, x13
b.eq .LBB1_30
.LBB1_29:
ldr x8, [sp, #8]
ldr x12, [x9]
subs w11, w11, #1
eor x8, x12, x8
str x8, [x9], #8
b.ne .LBB1_29
.LBB1_30:

After:

b.hs .LBB1_22
mov x16, xzr
b .LBB1_25
.LBB1_22:
lsr x16, x13, #3
dup v0.2d, x14
add x17, x8, #16
and x16, x16, #0xfffffffc
mov x18, x16
.LBB1_23:
ldp q1, q2, [x17, #-16]
subs x18, x18, #4
eor v1.16b, v1.16b, v0.16b
eor v2.16b, v2.16b, v0.16b
stp q1, q2, [x17, #-16]
add x17, x17, #32
b.ne .LBB1_23
cmp x15, x16
b.eq .LBB1_27
.LBB1_25:
add x17, x8, x16, lsl #3
sub x15, x16, x15
.LBB1_26:
ldr x16, [x17]
adds x15, x15, #1
eor x16, x16, x14
str x16, [x17], #8
b.lo .LBB1_26
.LBB1_27: |
Loops whose bodies do not depend on previous iterations are easier for the compiler to optimize (a distilled sketch follows below).
Tested on M3 (please recheck on other platforms).
Has to be retested on something else.
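A distilled sketch of the pattern (hypothetical code, not taken from the PR): the after-style body is a plain indexed loop whose iterations do not depend on one another, which typically gives the compiler more freedom to vectorize than a loop that advances pointers and a decrementing counter inside the body.

#include <cstdint>

// Before-style: pointers and the counter are mutated inside the loop body.
void XorChunksPtr(uint8_t *dst, const uint8_t *src, uint64_t mask8,
                  uint32_t loop) {
  while (loop--) {
    const uint64_t *from = (const uint64_t *)src;
    uint64_t *to = (uint64_t *)dst;
    *to = *from ^ mask8;
    src += 8;
    dst += 8;
  }
}

// After-style: each iteration depends only on the index i.
void XorChunksIndexed(uint8_t *dst, const uint8_t *src, uint64_t mask8,
                      uint32_t loop) {
  const uint64_t *from = (const uint64_t *)src;
  uint64_t *to = (uint64_t *)dst;
  for (uint32_t i = 0; i < loop; i++) to[i] = from[i] ^ mask8;
}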